By Alex Cortez, Adam Hernandez, and Alex Pearce
GitHub: https://github.com/ACollectionOfAtoms/DV_FinalProject
This data was provided by the Enigma.io database, which can be found here. It is described as follows: United States Cancer Statistics (USCS) information is aggregated from thousands of local cancer registries and published by the National Program of Cancer Registries (NPCR), a division of the Centers for Disease Control and Prevention (CDC).
source("../01 Data/load_packages.R",echo=TRUE)
##
## > library("ggplot2")
##
## > library("gplots")
##
## > library("grid")
##
## > library("plyr")
##
## > library("RCurl")
##
## > library("reshape2")
##
## > library("dplyr")
##
## > library("tidyr")
##
## > library("jsonlite")
##
## > library("knitr")
source("../01 Data/cancer_tables.R",echo=TRUE)
##
## > load("../01 Data/all_tables.Rdata")
source("../02 Data Wrangling/general_age_analysis.R",echo=TRUE)
##
## > c_age <- cancer_by_age
##
## > c_age$COUNT <- as.character(c_age$COUNT)
##
## > c_age$COUNT[c_age$COUNT == "null"] <- 1
##
## > c_age$COUNT <- as.factor(c_age$COUNT)
##
## > all_age <- c_age %>% select(AGE, COUNT, EVENT_TYPE) %>%
## + filter(COUNT != "null") %>% group_by(AGE) %>% summarise(sum = sum(COUNT)) %>%
## + .... [TRUNCATED]
##
## > incidence_age <- c_age %>% select(AGE, COUNT, EVENT_TYPE) %>%
## + filter(COUNT != "null", EVENT_TYPE == "Incidence") %>% group_by(AGE) %>%
## + .... [TRUNCATED]
##
## > Mortality_age <- c_age %>% select(AGE, COUNT, EVENT_TYPE) %>%
## + filter(COUNT != "null", EVENT_TYPE == "Mortality") %>% group_by(AGE) %>%
## + .... [TRUNCATED]
##
## > all_age <- bind_rows(incidence_age, Mortality_age)
##
## > tbl_df(all_age)
## Source: local data frame [36 x 4]
##
## AGE sum cume type
## 1 <1 2555173 0.05555556 Incidence
## 2 1-4 3854005 0.11111111 Incidence
## 3 10-14 4729815 0.16666667 Incidence
## 4 15-19 6194267 0.22222222 Incidence
## 5 20-24 8932366 0.27777778 Incidence
## 6 25-29 11628032 0.33333333 Incidence
## 7 30-34 13484646 0.38888889 Incidence
## 8 35-39 17245125 0.50000000 Incidence
## 9 40-44 17571498 0.61111111 Incidence
## 10 45-49 18350855 0.83333333 Incidence
## .. ... ... ... ...
##
## > ggplot(all_age, aes(x = AGE, y = sum, fill = type)) +
## + geom_bar(stat = "identity") + scale_fill_manual(values = c("#000000",
## + "#CCCCCC") .... [TRUNCATED]
source("../02 Data Wrangling/age_site.R",echo=TRUE)
##
## > age <- cancer_by_age
##
## > age$COUNT <- as.character(age$COUNT)
##
## > age$COUNT[age$COUNT == "null"] <- 1
##
## > age$COUNT <- as.factor(age$COUNT)
##
## > breasts <- c("Female Breast", "Male Breast", "Male and Female Breast <i>in situ</i>",
## + "Female Breast, <i>in situ</i>", "Male and Female Breas ..." ... [TRUNCATED]
##
## > breast <- c(`Female Breast` = "Breast", `Male Breast` = "Breast",
## + `Male and Female Breast <i>in situ</i>` = "Breast", `Female Breast, <i>in s .... [TRUNCATED]
##
## > age$SITE <- as.character(age$SITE)
##
## > age$SITE[age$SITE %in% breasts] <- "Breast"
##
## > age$SITE <- as.factor(age$SITE)
##
## > MAP <- c(`70-74` = "Young Adult", `27-29` = "Young Adult",
## + `20-24` = "Young Adult", `17-19` = "Child", `7-14` = "Child",
## + `7-9` = "Chil ..." ... [TRUNCATED]
##
## > age <- age %>% select(AGE, SITE, EVENT_TYPE, COUNT,
## + RACE, SEX) %>% filter(COUNT != "null", SITE != "All Cancer Sites Combined",
## + SEX != .... [TRUNCATED]
##
## > age$AGE <- MAP[age$AGE]
##
## > tbl_df(age)
## Source: local data frame [124,488 x 6]
##
## AGE SITE EVENT_TYPE COUNT
## 1 Elder Adult Colon and Rectum Incidence 9993
## 2 Elder Adult Colon and Rectum Incidence 9992
## 3 Elder Adult Corpus and Uterus, NOS Mortality 999
## 4 Elder Adult Oral Cavity and Pharynx Incidence 999
## 5 Elder Adult Corpus and Uterus, NOS Mortality 999
## 6 Middle Adult Liver and Intrahepatic Bile Duct Mortality 999
## 7 Young Adult Lung and Bronchus Incidence 999
## 8 Middle Adult Colon and Rectum Mortality 999
## 9 Middle Adult Prostate Incidence 9982
## 10 Elder Adult Pancreas Incidence 998
## .. ... ... ... ...
## Variables not shown: RACE (fctr), SEX (fctr)
##
## > total <- age %>% group_by(SITE) %>% summarise(sum = sum(COUNT)) %>%
## + arrange(desc(sum)) %>% slice(1:10)
##
## > tbl_df(total)
## Source: local data frame [10 x 2]
##
## SITE sum
## 1 Brain and Other Nervous System 16690236
## 2 Leukemias 16250318
## 3 Colon and Rectum 15496776
## 4 Non-Hodgkin Lymphoma 14243447
## 5 Lung and Bronchus 13848469
## 6 Liver and Intrahepatic Bile Duct 12923969
## 7 Stomach 12816624
## 8 Kidney and Renal Pelvis 12020378
## 9 Oral Cavity and Pharynx 11153134
## 10 Breast 11053659
##
## > total_7 <- as.character(total$SITE)
##
## > total_7
## [1] "Brain and Other Nervous System" "Leukemias"
## [3] "Colon and Rectum" "Non-Hodgkin Lymphoma"
## [5] "Lung and Bronchus" "Liver and Intrahepatic Bile Duct"
## [7] "Stomach" "Kidney and Renal Pelvis"
## [9] "Oral Cavity and Pharynx" "Breast"
##
## > child <- age %>% filter(AGE == "Child") %>% group_by(SITE) %>%
## + summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>% slice(1:7)
##
## > child_7 <- as.character(child$SITE)
##
## > yadult <- age %>% filter(AGE == "Young Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1:7 .... [TRUNCATED]
##
## > yadult_7 <- as.character(yadult$SITE)
##
## > madult <- age %>% filter(AGE == "Middle Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1: .... [TRUNCATED]
##
## > madult_7 <- as.character(madult$SITE)
##
## > eadult <- age %>% filter(AGE == "Elder Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1:7 .... [TRUNCATED]
##
## > eadult_7 <- as.character(eadult$SITE)
##
## > all_top <- c(child_7, yadult_7, madult_7, eadult_7,
## + total_7)
##
## > all_top <- unique(all_top)
##
## > tbl_df(age)
## Source: local data frame [124,488 x 6]
##
## AGE SITE EVENT_TYPE COUNT
## 1 Elder Adult Colon and Rectum Incidence 9993
## 2 Elder Adult Colon and Rectum Incidence 9992
## 3 Elder Adult Corpus and Uterus, NOS Mortality 999
## 4 Elder Adult Oral Cavity and Pharynx Incidence 999
## 5 Elder Adult Corpus and Uterus, NOS Mortality 999
## 6 Middle Adult Liver and Intrahepatic Bile Duct Mortality 999
## 7 Young Adult Lung and Bronchus Incidence 999
## 8 Middle Adult Colon and Rectum Mortality 999
## 9 Middle Adult Prostate Incidence 9982
## 10 Elder Adult Pancreas Incidence 998
## .. ... ... ... ...
## Variables not shown: RACE (fctr), SEX (fctr)
##
## > age <- filter(age, SITE %in% all_top) %>% group_by(AGE,
## + SITE, EVENT_TYPE, RACE, SEX)
##
## > ggplot(age, aes(AGE), weight = COUNT, fill = SITE) +
## + scale_x_discrete(limits = c("Child", "Young Adult", "Middle Adult",
## + "Elder Ad ..." ... [TRUNCATED]
source("../02 Data Wrangling/age_site2.R",echo=TRUE)
##
## > age <- cancer_by_age
##
## > breasts <- c("Female Breast", "Male Breast", "Male and Female Breast <i>in situ</i>",
## + "Female Breast, <i>in situ</i>", "Male and Female Breas ..." ... [TRUNCATED]
##
## > breast <- c(`Female Breast` = "Breast", `Male Breast` = "Breast",
## + `Male and Female Breast <i>in situ</i>` = "Breast", `Female Breast, <i>in s .... [TRUNCATED]
##
## > age$SITE <- as.character(age$SITE)
##
## > age$SITE[age$SITE %in% breasts] <- "Breast"
##
## > age$SITE <- as.factor(age$SITE)
##
## > MAP <- c(`70-74` = "Young Adult", `27-29` = "Young Adult",
## + `20-24` = "Young Adult", `17-19` = "Child", `7-14` = "Child",
## + `7-9` = "Chil ..." ... [TRUNCATED]
##
## > age <- age %>% select(AGE, SITE, EVENT_TYPE, COUNT,
## + RACE, SEX) %>% filter(COUNT != "null", SITE != "All Cancer Sites Combined",
## + SEX != .... [TRUNCATED]
##
## > age$AGE <- MAP[age$AGE]
##
## > tbl_df(age)
## Source: local data frame [54,643 x 6]
##
## AGE SITE EVENT_TYPE COUNT
## 1 Elder Adult Colon and Rectum Incidence 9993
## 2 Elder Adult Colon and Rectum Incidence 9992
## 3 Elder Adult Corpus and Uterus, NOS Mortality 999
## 4 Elder Adult Oral Cavity and Pharynx Incidence 999
## 5 Elder Adult Corpus and Uterus, NOS Mortality 999
## 6 Middle Adult Liver and Intrahepatic Bile Duct Mortality 999
## 7 Young Adult Lung and Bronchus Incidence 999
## 8 Middle Adult Colon and Rectum Mortality 999
## 9 Middle Adult Prostate Incidence 9982
## 10 Elder Adult Pancreas Incidence 998
## .. ... ... ... ...
## Variables not shown: RACE (fctr), SEX (fctr)
##
## > total <- age %>% group_by(SITE) %>% summarise(sum = sum(COUNT)) %>%
## + arrange(desc(sum)) %>% slice(1:10)
##
## > tbl_df(total)
## Source: local data frame [10 x 2]
##
## SITE sum
## 1 Brain and Other Nervous System 16684764
## 2 Leukemias 16244846
## 3 Colon and Rectum 15491304
## 4 Non-Hodgkin Lymphoma 14237975
## 5 Lung and Bronchus 13842997
## 6 Liver and Intrahepatic Bile Duct 12918497
## 7 Stomach 12811152
## 8 Kidney and Renal Pelvis 12014906
## 9 Oral Cavity and Pharynx 11147662
## 10 Breast 11046819
##
## > total_7 <- as.character(total$SITE)
##
## > total_7
## [1] "Brain and Other Nervous System" "Leukemias"
## [3] "Colon and Rectum" "Non-Hodgkin Lymphoma"
## [5] "Lung and Bronchus" "Liver and Intrahepatic Bile Duct"
## [7] "Stomach" "Kidney and Renal Pelvis"
## [9] "Oral Cavity and Pharynx" "Breast"
##
## > child <- age %>% filter(AGE == "Child") %>% group_by(SITE) %>%
## + summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>% slice(1:7)
##
## > child_7 <- as.character(child$SITE)
##
## > yadult <- age %>% filter(AGE == "Young Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1:7 .... [TRUNCATED]
##
## > yadult_7 <- as.character(yadult$SITE)
##
## > madult <- age %>% filter(AGE == "Middle Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1: .... [TRUNCATED]
##
## > madult_7 <- as.character(madult$SITE)
##
## > eadult <- age %>% filter(AGE == "Elder Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1:7 .... [TRUNCATED]
##
## > eadult_7 <- as.character(eadult$SITE)
##
## > all_top <- c(child_7, yadult_7, madult_7, eadult_7,
## + total_7)
##
## > all_top <- unique(all_top)
##
## > tbl_df(age)
## Source: local data frame [54,643 x 6]
##
## AGE SITE EVENT_TYPE COUNT
## 1 Elder Adult Colon and Rectum Incidence 9993
## 2 Elder Adult Colon and Rectum Incidence 9992
## 3 Elder Adult Corpus and Uterus, NOS Mortality 999
## 4 Elder Adult Oral Cavity and Pharynx Incidence 999
## 5 Elder Adult Corpus and Uterus, NOS Mortality 999
## 6 Middle Adult Liver and Intrahepatic Bile Duct Mortality 999
## 7 Young Adult Lung and Bronchus Incidence 999
## 8 Middle Adult Colon and Rectum Mortality 999
## 9 Middle Adult Prostate Incidence 9982
## 10 Elder Adult Pancreas Incidence 998
## .. ... ... ... ...
## Variables not shown: RACE (fctr), SEX (fctr)
##
## > age <- filter(age, SITE %in% all_top) %>% group_by(AGE,
## + SITE, EVENT_TYPE, RACE, SEX)
##
## > ggplot(age, aes(AGE), weight = COUNT, fill = SITE) +
## + scale_x_discrete(limits = c("Child", "Young Adult", "Middle Adult",
## + "Elder Ad ..." ... [TRUNCATED]
source("../02 Data Wrangling/age_site3.R",echo=TRUE)
##
## > age <- cancer_by_age
##
## > breasts <- c("Female Breast", "Male Breast", "Male and Female Breast <i>in situ</i>",
## + "Female Breast, <i>in situ</i>", "Male and Female Breas ..." ... [TRUNCATED]
##
## > breast <- c(`Female Breast` = "Breast", `Male Breast` = "Breast",
## + `Male and Female Breast <i>in situ</i>` = "Breast", `Female Breast, <i>in s .... [TRUNCATED]
##
## > age$SITE <- as.character(age$SITE)
##
## > age$SITE[age$SITE %in% breasts] <- "Breast"
##
## > age$SITE <- as.factor(age$SITE)
##
## > MAP <- c(`70-74` = "Young Adult", `27-29` = "Young Adult",
## + `20-24` = "Young Adult", `17-19` = "Child", `7-14` = "Child",
## + `7-9` = "Chil ..." ... [TRUNCATED]
##
## > age <- age %>% select(AGE, SITE, EVENT_TYPE, COUNT,
## + RACE, SEX) %>% filter(COUNT != "null", SITE != "All Cancer Sites Combined",
## + SEX != .... [TRUNCATED]
##
## > age$AGE <- MAP[age$AGE]
##
## > tbl_df(age)
## Source: local data frame [54,643 x 6]
##
## AGE SITE EVENT_TYPE COUNT
## 1 Elder Adult Colon and Rectum Incidence 9993
## 2 Elder Adult Colon and Rectum Incidence 9992
## 3 Elder Adult Corpus and Uterus, NOS Mortality 999
## 4 Elder Adult Oral Cavity and Pharynx Incidence 999
## 5 Elder Adult Corpus and Uterus, NOS Mortality 999
## 6 Middle Adult Liver and Intrahepatic Bile Duct Mortality 999
## 7 Young Adult Lung and Bronchus Incidence 999
## 8 Middle Adult Colon and Rectum Mortality 999
## 9 Middle Adult Prostate Incidence 9982
## 10 Elder Adult Pancreas Incidence 998
## .. ... ... ... ...
## Variables not shown: RACE (fctr), SEX (fctr)
##
## > total <- age %>% group_by(SITE) %>% summarise(sum = sum(COUNT)) %>%
## + arrange(desc(sum)) %>% slice(1:10)
##
## > tbl_df(total)
## Source: local data frame [10 x 2]
##
## SITE sum
## 1 Brain and Other Nervous System 16684764
## 2 Leukemias 16244846
## 3 Colon and Rectum 15491304
## 4 Non-Hodgkin Lymphoma 14237975
## 5 Lung and Bronchus 13842997
## 6 Liver and Intrahepatic Bile Duct 12918497
## 7 Stomach 12811152
## 8 Kidney and Renal Pelvis 12014906
## 9 Oral Cavity and Pharynx 11147662
## 10 Breast 11046819
##
## > total_7 <- as.character(total$SITE)
##
## > total_7
## [1] "Brain and Other Nervous System" "Leukemias"
## [3] "Colon and Rectum" "Non-Hodgkin Lymphoma"
## [5] "Lung and Bronchus" "Liver and Intrahepatic Bile Duct"
## [7] "Stomach" "Kidney and Renal Pelvis"
## [9] "Oral Cavity and Pharynx" "Breast"
##
## > child <- age %>% filter(AGE == "Child") %>% group_by(SITE) %>%
## + summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>% slice(1:7)
##
## > child_7 <- as.character(child$SITE)
##
## > yadult <- age %>% filter(AGE == "Young Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1:7 .... [TRUNCATED]
##
## > yadult_7 <- as.character(yadult$SITE)
##
## > madult <- age %>% filter(AGE == "Middle Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1: .... [TRUNCATED]
##
## > madult_7 <- as.character(madult$SITE)
##
## > eadult <- age %>% filter(AGE == "Elder Adult") %>%
## + group_by(SITE) %>% summarise(sum = sum(COUNT)) %>% arrange(desc(sum)) %>%
## + slice(1:7 .... [TRUNCATED]
##
## > eadult_7 <- as.character(eadult$SITE)
##
## > all_top <- c(child_7, yadult_7, madult_7, eadult_7,
## + total_7)
##
## > all_top <- unique(all_top)
##
## > tbl_df(age)
## Source: local data frame [54,643 x 6]
##
## AGE SITE EVENT_TYPE COUNT
## 1 Elder Adult Colon and Rectum Incidence 9993
## 2 Elder Adult Colon and Rectum Incidence 9992
## 3 Elder Adult Corpus and Uterus, NOS Mortality 999
## 4 Elder Adult Oral Cavity and Pharynx Incidence 999
## 5 Elder Adult Corpus and Uterus, NOS Mortality 999
## 6 Middle Adult Liver and Intrahepatic Bile Duct Mortality 999
## 7 Young Adult Lung and Bronchus Incidence 999
## 8 Middle Adult Colon and Rectum Mortality 999
## 9 Middle Adult Prostate Incidence 9982
## 10 Elder Adult Pancreas Incidence 998
## .. ... ... ... ...
## Variables not shown: RACE (fctr), SEX (fctr)
##
## > age <- filter(age, SITE %in% all_top) %>% group_by(AGE,
## + SITE, EVENT_TYPE, RACE, SEX)
##
## > ggplot(age, aes(AGE), weight = COUNT, fill = SITE) +
## + scale_x_discrete(limits = c("Child", "Young Adult", "Middle Adult",
## + "Elder Ad ..." ... [TRUNCATED]